Data Mining Final Project

Authors

RAVLEEN KAUR CHADHA, MRIDULA KALAISELVAN, YASH SHARMA

# Bootstrap pacman once, then let it install/attach every other dependency.
if (!require(pacman)) {
  install.packages("pacman")
}

pacman::p_load(
  tidyverse, scales, devtools, here, class, plotly, dplyr, caret,
  BiocManager, smotefamily, pROC, mclust, factoextra, cluster,
  dbscan, arules, arulesViz, themis, recipes, e1071, GGally
)
# Reading the dataset
hepatitis <- read_csv("HepatitisCdata.csv")
# Handling the NA values by replacing them by the median of each column
# (median is robust to the heavily skewed lab-value distributions; applied
# column-wise to every numeric variable)
hepatitis <- hepatitis %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Verifying that all NA values have been handled
colSums(is.na(hepatitis))
    ...1 Category      Age      Sex      ALB      ALP      ALT      AST 
       0        0        0        0        0        0        0        0 
     BIL      CHE     CHOL     CREA      GGT     PROT 
       0        0        0        0        0        0 
# Persist the cleaned data so later sections can reload it without re-imputing
write.csv(hepatitis, "HepatitisC_Cleaned.csv", row.names = FALSE)
# EDA for the dataset

# Defining a colorblind-friendly palette

color_palette <- c("#440154FF", "#3B528BFF", "#21908CFF", "#5DC863FF", "#FDE725FF")

# Map the raw Category values to readable labels, once, up front.
# BUG FIX: the raw Category column holds strings such as "0=Blood Donor",
# so the previous case_when() comparing Category against the bare codes
# 0..4 never matched and the hover text fell through to the raw value.
# Also, plotly's legend has no `labels` attribute, so the former second
# layout() call that tried to rename legend entries was silently ignored.
# Recoding the factor here fixes both hover text and legend in one place.
category_levels <- c(
  "0=Blood Donor"          = "Blood Donor",
  "0s=suspect Blood Donor" = "Suspect Blood Donor",
  "1=Hepatitis"            = "Hepatitis",
  "2=Fibrosis"             = "Fibrosis",
  "3=Cirrhosis"            = "Cirrhosis"
)
hepatitis$CategoryLabel <- factor(
  hepatitis$Category,
  levels = names(category_levels),
  labels = unname(category_levels)
)

# Creating an enhanced interactive scatterplot of the two transaminases,
# colored by diagnosis group
p <- plot_ly(
  data = hepatitis,
  x = ~AST, 
  y = ~ALT, 
  type = 'scatter', 
  mode = 'markers',
  color = ~CategoryLabel,
  colors = color_palette,
  marker = list(
    size = 8,
    opacity = 0.8
  ),
  text = ~paste(
    "Category:", CategoryLabel,
    "<br>AST:", AST,
    "<br>ALT:", ALT,
    "<br>Age:", Age
  )
) %>%
  layout(
    title = list(
      text = "Interactive Scatterplot: AST vs ALT by Category",
      font = list(size = 18)
    ),
    xaxis = list(
      title = "AST (Aspartate Transaminase)",
      titlefont = list(size = 14),
      zeroline = FALSE,
      showgrid = FALSE
    ),
    yaxis = list(
      title = "ALT (Alanine Transaminase)",
      titlefont = list(size = 14),
      zeroline = FALSE,
      showgrid = FALSE
    ),
    legend = list(
      title = list(text = "Category"),
      font = list(size = 12),
      orientation = "v",
      x = 1.1,
      y = 0.5,
      xanchor = "left"
    ),
    plot_bgcolor = "#FFFFFF",
    paper_bgcolor = "#FFFFFF"
  )

# Displaying the plot
p

Classification Analysis for the dataset Hepatitis-C - Logistic Regression

# Load the cleaned dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")

# Build the binary target (1 = healthy blood donor, 0 = any other category),
# encode Sex numerically ("m" -> 1, "f" -> 0), and drop columns the model
# must not see (the row index and the original multi-class label).
hepatitis <- hepatitis %>%
  mutate(
    BinaryCategory = ifelse(Category == "0=Blood Donor", 1, 0),
    Sex = ifelse(Sex == "m", 1, 0)
  ) %>%
  select(-...1, -Category)

# The target must be a factor for classification with caret
hepatitis$BinaryCategory <- as.factor(hepatitis$BinaryCategory)

# Stratified 80/20 train/test split
set.seed(123)  # For reproducibility
train_index <- createDataPartition(hepatitis$BinaryCategory, p = 0.8, list = FALSE)
train_data <- hepatitis[train_index, ]
test_data <- hepatitis[-train_index, ]

# Create a recipe for SMOTE
# step_smote() synthesizes minority-class examples on the TRAINING data only,
# bringing both classes to equal size before the model is fit.
smote_recipe <- recipe(BinaryCategory ~ ., data = train_data) %>%
  step_smote(BinaryCategory, over_ratio = 1)  # Balance classes to a 1:1 ratio

# Prepare the SMOTE dataset
smote_data <- prep(smote_recipe) %>%
  bake(new_data = NULL)

# Check the class distribution after SMOTE
print("Class distribution after applying SMOTE:")
[1] "Class distribution after applying SMOTE:"
print(table(smote_data$BinaryCategory))

  0   1 
427 427 
# Train logistic regression on the SMOTE-balanced dataset
# (binomial glm models the second factor level, "1" = blood donor, as success)
smote_logistic_model <- glm(BinaryCategory ~ ., data = smote_data, family = "binomial")

# Summarize the model
print("Logistic Regression Model Summary:")
[1] "Logistic Regression Model Summary:"
print(summary(smote_logistic_model))

Call:
glm(formula = BinaryCategory ~ ., family = "binomial", data = smote_data)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  6.246494   3.005821   2.078 0.037697 *  
Age          0.055334   0.019529   2.833 0.004605 ** 
Sex          1.277596   0.462124   2.765 0.005699 ** 
ALB          0.170787   0.051974   3.286 0.001016 ** 
ALP          0.091808   0.011753   7.812 5.65e-15 ***
ALT          0.033121   0.007777   4.259 2.06e-05 ***
AST         -0.163989   0.021844  -7.507 6.04e-14 ***
BIL         -0.084453   0.025373  -3.328 0.000873 ***
CHE         -0.239305   0.104772  -2.284 0.022368 *  
CHOL         0.646587   0.210365   3.074 0.002115 ** 
CREA        -0.024328   0.004495  -5.412 6.22e-08 ***
GGT         -0.038332   0.006404  -5.986 2.15e-09 ***
PROT        -0.182912   0.040763  -4.487 7.22e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1183.90  on 853  degrees of freedom
Residual deviance:  251.74  on 841  degrees of freedom
AIC: 277.74

Number of Fisher Scoring iterations: 9
# Predict on the test set
# (type = "response" returns the fitted probability of class "1")
smote_predictions <- predict(smote_logistic_model, newdata = test_data, type = "response")

# Convert probabilities to class labels (threshold = 0.5)
smote_predicted_classes <- ifelse(smote_predictions > 0.5, 1, 0)

# Evaluate the model
# NOTE(review): confusionMatrix() takes the first factor level ("0") as the
# positive class by default (see 'Positive' Class : 0 below) — confirm that
# is the intended positive class for the reported sensitivity/PPV.
smote_confusion_matrix <- confusionMatrix(as.factor(smote_predicted_classes), as.factor(test_data$BinaryCategory))
print("Confusion Matrix after applying SMOTE:")
[1] "Confusion Matrix after applying SMOTE:"
print(smote_confusion_matrix)
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 14  7
         1  2 99
                                          
               Accuracy : 0.9262          
                 95% CI : (0.8646, 0.9657)
    No Information Rate : 0.8689          
    P-Value [Acc > NIR] : 0.03364         
                                          
                  Kappa : 0.7142          
                                          
 Mcnemar's Test P-Value : 0.18242         
                                          
            Sensitivity : 0.8750          
            Specificity : 0.9340          
         Pos Pred Value : 0.6667          
         Neg Pred Value : 0.9802          
             Prevalence : 0.1311          
         Detection Rate : 0.1148          
   Detection Prevalence : 0.1721          
      Balanced Accuracy : 0.9045          
                                          
       'Positive' Class : 0               
                                          
# Plot ROC curve for the SMOTE model
# (as.numeric() on the factor yields 1/2 codes; pROC only requires two
# distinct response values, so the curve and AUC are unaffected)
smote_roc_curve <- roc(as.numeric(test_data$BinaryCategory), smote_predictions)
plot(smote_roc_curve, col = "red", main = "ROC Curve for Logistic Regression (SMOTE)")

Probability Distribution Visualization for Logistic Regression

# Attach the predicted probabilities so they can be plotted per actual class;
# well-separated density peaks near 0 and 1 indicate a confident classifier
test_data$Predicted_Probabilities <- smote_predictions
ggplot(test_data, aes(x = Predicted_Probabilities, fill = BinaryCategory)) +
  geom_density(alpha = 0.5) +
  labs(title = "Probability Distribution for Logistic Regression",
       x = "Predicted Probability",
       y = "Density",
       fill = "Actual Class") +
  theme_minimal()

GMM Clustering

# Step 1: Load the dataset
# Assuming the data has been preprocessed (scaled, PCA applied, and features weighted)
hepatitis_data <- read.csv("HepatitisC_Cleaned.csv")

# Select numeric columns for clustering
# NOTE(review): this selects every numeric column, which appears to include
# the row-index column carried over from the CSV — confirm it is meant to be
# part of the clustering features.
numeric_data <- hepatitis_data[sapply(hepatitis_data, is.numeric)]
scaled_data <- scale(numeric_data)

# Perform PCA for dimensionality reduction
# (center/scale. are redundant here since the input was already standardized,
# but harmless)
pca <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
pca_data <- pca$x[, 1:2]  # Retain the first two principal components

# Step 2: Apply Gaussian Mixture Models (GMM)
set.seed(123)  # Ensure reproducibility
gmm_model <- Mclust(pca_data)  # model family and cluster count chosen automatically

# Optimal number of clusters
cat("Optimal Number of Clusters (GMM):", gmm_model$G, "\n")
Optimal Number of Clusters (GMM): 2 
# Step 3: Evaluate Clustering Quality
# Silhouette Score (closer to 1 = better-separated clusters)
silhouette_scores <- silhouette(gmm_model$classification, dist(pca_data))
avg_silhouette <- mean(silhouette_scores[, 3])
cat("Average Silhouette Score (GMM):", round(avg_silhouette, 2), "\n")
Average Silhouette Score (GMM): 0.62 
# Adjusted Rand Index (ARI)
# Compares the unsupervised cluster assignments against the known Category
# labels when they are present (1 = perfect agreement, 0 = chance level)
if ("Category" %in% colnames(hepatitis_data)) {
  actual_labels <- as.numeric(factor(hepatitis_data$Category))  # Replace 'Category' with your actual label column
  ari <- adjustedRandIndex(actual_labels, gmm_model$classification)
  cat("Adjusted Rand Index (ARI):", round(ari, 2), "\n")
} else {
  cat("No actual labels provided for ARI calculation.\n")
}
Adjusted Rand Index (ARI): 0.63 
# Step 5: Add Cluster Assignments to Dataset
hepatitis_data$Cluster <- as.factor(gmm_model$classification)

# Step 6: Analyze Clusters
# Summary statistics for each cluster
# NOTE(review): passing mean/na.rm directly through across() is deprecated in
# current dplyr; the lambda form ~ mean(.x, na.rm = TRUE) is preferred.
cluster_summary <- hepatitis_data %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE))

# Enhanced GMM Clustering Visualization with Labels
# NOTE(review): the legend hard-codes cluster 1 = healthy / cluster 2 =
# diseased; Mclust's cluster numbering is arbitrary — confirm the mapping
# against the cluster summaries before presenting.
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(gmm_model$classification))) +
  geom_point(size = 3, alpha = 0.7) +  # Data points
  stat_ellipse(aes(fill = as.factor(gmm_model$classification)), alpha = 0.2, geom = "polygon") +  # Ellipses
  scale_color_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
                     labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
  scale_fill_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
                    labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
  labs(
    title = "Gaussian Mixture Model Clustering",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "right",
    legend.text = element_text(size = 10), 
    axis.title.x = element_text(size = 10), 
    axis.title.y = element_text(size = 10)
  )

DBSCAN Clustering

# Scale the data
# (pca_data comes from the GMM section above; re-standardize the two PCs)
scaled_data <- scale(pca_data)

# Apply DBSCAN with eps = 1.0 and minPts = 5 (rule of thumb: dimensions + 1)
dbscan_model <- dbscan(scaled_data, eps = 1, minPts = 5)

# Calculate cluster sizes 
# (DBSCAN labels noise points as cluster 0)
cluster_sizes <- table(dbscan_model$cluster)

# Calculate the percentage of noise
noise_percentage <- sum(dbscan_model$cluster == 0) / nrow(scaled_data) * 100

# Compute silhouette scores
# NOTE(review): noise points (cluster 0) are treated here as a cluster of
# their own, which skews the average silhouette — consider excluding them.
silhouette_scores <- silhouette(dbscan_model$cluster, dist(scaled_data))
avg_silhouette <- mean(silhouette_scores[, 3])

# Visualize DBSCAN clusters
# NOTE(review): the colors/labels assume exactly noise + two clusters; with
# other eps/minPts settings the group count can differ and these labels
# would no longer line up with the actual clusters.
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(dbscan_model$cluster))) +
  geom_point(size = 3, alpha = 0.7) +
  scale_color_manual(
    values = c("#999999", "#4CAF50", "#FF5722"),
    name = "Cluster",
    labels = c("Noise", "Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
  labs(
    title = "DBSCAN Clustering Results",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "right",
    legend.text = element_text(size = 10), 
    axis.title.x = element_text(size = 10), 
    axis.title.y = element_text(size = 10)
  )

# Summary of clustering quality and composition
cat("DBSCAN Results Summary:\n")
DBSCAN Results Summary:
cat("Number of Clusters:", length(unique(dbscan_model$cluster)) - 1, "\n")  # Exclude noise cluster
Number of Clusters: 2 
cat("Noise Percentage:", round(noise_percentage, 2), "%\n")
Noise Percentage: 2.11 %
cat("Average Silhouette Score:", round(avg_silhouette, 2), "\n")
Average Silhouette Score: 0.65 
cat("\nAdditional Insights:\n")

Additional Insights:
cat("- Noise (Cluster 0) contains", cluster_sizes[1], "points.\n")
- Noise (Cluster 0) contains 13 points.
cat("- Largest cluster size:", max(cluster_sizes[-1]), "points (excluding noise).\n")
- Largest cluster size: 598 points (excluding noise).
cat("- Smallest cluster size:", min(cluster_sizes[-1]), "points (excluding noise).\n")
- Smallest cluster size: 4 points (excluding noise).

Classification Analysis for the dataset Hepatitis-C - SVM

# Load the cleaned dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")

# Preprocessing: treat the categorical columns as factors
hepatitis$Sex <- as.factor(hepatitis$Sex)

# Multi-class target for the SVM
hepatitis$Category <- as.factor(hepatitis$Category)

# Stratified 80/20 train/test split
set.seed(123)  # For reproducibility
trainIndex <- createDataPartition(hepatitis$Category, p = 0.8, list = FALSE)
trainData <- hepatitis[trainIndex, ]
testData <- hepatitis[-trainIndex, ]

# Numeric columns that need standardizing
numeric_features <- names(which(sapply(trainData, is.numeric)))

# Standardize with TRAINING-set statistics only, so no information from the
# test set leaks into the scaler.
scaled_train <- trainData
scaled_test <- testData

for (feature in numeric_features) {
  # Scaling parameters come from the training split
  mu <- mean(trainData[[feature]], na.rm = TRUE)
  sigma <- sd(trainData[[feature]], na.rm = TRUE)

  # Apply the same transform to both splits
  scaled_train[[feature]] <- (trainData[[feature]] - mu) / sigma
  scaled_test[[feature]] <- (testData[[feature]] - mu) / sigma
}

# RBF-kernel SVM on the standardized features
svm_model <- svm(Category ~ ., data = scaled_train, kernel = "radial", cost = 1, gamma = 0.1)

# Predict on the held-out test set
svm_predictions <- predict(svm_model, newdata = scaled_test)

# Evaluate the model
conf_matrix <- confusionMatrix(svm_predictions, scaled_test$Category)
print("Confusion Matrix for SVM:")
[1] "Confusion Matrix for SVM:"
# Full confusion matrix with per-class sensitivity/specificity
print(conf_matrix)
Confusion Matrix and Statistics

                        Reference
Prediction               0=Blood Donor 0s=suspect Blood Donor 1=Hepatitis
  0=Blood Donor                    106                      0           0
  0s=suspect Blood Donor             0                      0           0
  1=Hepatitis                        0                      0           3
  2=Fibrosis                         0                      0           0
  3=Cirrhosis                        0                      1           1
                        Reference
Prediction               2=Fibrosis 3=Cirrhosis
  0=Blood Donor                   0           0
  0s=suspect Blood Donor          0           0
  1=Hepatitis                     0           0
  2=Fibrosis                      3           1
  3=Cirrhosis                     1           5

Overall Statistics
                                          
               Accuracy : 0.9669          
                 95% CI : (0.9175, 0.9909)
    No Information Rate : 0.876           
    P-Value [Acc > NIR] : 0.0004865       
                                          
                  Kappa : 0.8546          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 0=Blood Donor Class: 0s=suspect Blood Donor
Sensitivity                         1.000                      0.000000
Specificity                         1.000                      1.000000
Pos Pred Value                      1.000                           NaN
Neg Pred Value                      1.000                      0.991736
Prevalence                          0.876                      0.008264
Detection Rate                      0.876                      0.000000
Detection Prevalence                0.876                      0.000000
Balanced Accuracy                   1.000                      0.500000
                     Class: 1=Hepatitis Class: 2=Fibrosis Class: 3=Cirrhosis
Sensitivity                     0.75000           0.75000            0.83333
Specificity                     1.00000           0.99145            0.97391
Pos Pred Value                  1.00000           0.75000            0.62500
Neg Pred Value                  0.99153           0.99145            0.99115
Prevalence                      0.03306           0.03306            0.04959
Detection Rate                  0.02479           0.02479            0.04132
Detection Prevalence            0.02479           0.03306            0.06612
Balanced Accuracy               0.87500           0.87073            0.90362
# Features to display in the parallel coordinates plot
selected_features <- c("Age", "ALB", "ALP", "ALT", "AST", "BIL")

# Restrict to the three disease classes (blood donors excluded)
filtered_data <- scaled_train %>%
  filter(Category %in% c("1=Hepatitis", "2=Fibrosis", "3=Cirrhosis"))

# Column positions of the selected features, in data-frame order
feature_cols <- which(names(filtered_data) %in% selected_features)

# Parallel coordinates: one line per patient, standardized per feature
ggparcoord(data = filtered_data,
           columns = feature_cols,
           groupColumn = "Category",
           scale = "std", alphaLines = 0.5) +
  labs(title = "Parallel Coordinates Plot for Hepatitis, Fibrosis, and Cirrhosis",
       x = "Features", y = "Scaled Values") +
  theme_minimal(base_size = 14)

Association Rule Mining using the Apriori Algorithm

# 1. Data Preparation
# Discretize Age / ALT / AST into bands and map the Category values to
# readable disease labels for association-rule mining.
hepatitis_categorized <- hepatitis %>%
  mutate(
    Age_Group = case_when(
      Age < 35 ~ "Young",
      Age < 50 ~ "Middle",
      Age < 65 ~ "Senior",
      TRUE ~ "Elderly"
    ),
    ALT_Level = case_when(
      ALT < 40 ~ "Normal",
      ALT < 80 ~ "Mild_Elevation",
      ALT < 200 ~ "Moderate_Elevation",
      TRUE ~ "Severe_Elevation"
    ),
    AST_Level = case_when(
      AST < 40 ~ "Normal",
      AST < 80 ~ "Mild_Elevation",
      AST < 200 ~ "Moderate_Elevation",
      TRUE ~ "Severe_Elevation"
    ),
    # BUG FIX: Category holds strings such as "0=Blood Donor", so the old
    # comparisons against the bare codes 0..4 never matched and every
    # Disease_Status came out NA — the transactions below contained only
    # 12 items and no Disease_Status appeared in any mined rule.
    Disease_Status = case_when(
      Category == "0=Blood Donor" ~ "Blood_Donor",
      Category == "0s=suspect Blood Donor" ~ "Suspect",
      Category == "1=Hepatitis" ~ "Hepatitis",
      Category == "2=Fibrosis" ~ "Fibrosis",
      Category == "3=Cirrhosis" ~ "Cirrhosis"
    )
  )

# 2. Create transactions (one transaction per patient, one item per attribute)
hepatitis_trans <-
  as(data.frame(select(hepatitis_categorized, 
    Age_Group, ALT_Level, AST_Level, Disease_Status)), 
    "transactions")

# 3. Apply Apriori algorithm
# minlen/maxlen bound the rule size; support floor of 3% keeps rare but
# potentially clinically interesting itemsets
rules <- apriori(hepatitis_trans,
    parameter = list(
        support = 0.03,
        confidence = 0.5,
        minlen = 2,
        maxlen = 4
    ))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.5    0.1    1 none FALSE            TRUE       5    0.03      2
 maxlen target  ext
      4  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 18 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[12 item(s), 615 transaction(s)] done [0.00s].
sorting and recoding items ... [9 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [25 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
# 4. Process rules
# Drop redundant rules (those whose more general version has at least the
# same confidence), then rank by lift
rules_pruned <- 
  rules[!is.redundant(rules)]
rules_sorted <- 
  sort(rules_pruned, by = "lift", decreasing = TRUE)

# 5. Create rules dataframe
# Flatten the arules S4 object into a plain data frame for plotting/reporting
rules_df <- data.frame(
    lhs = labels(lhs(rules_sorted)),
    rhs = labels(rhs(rules_sorted)),
    support = quality(rules_sorted)$support,
    confidence = quality(rules_sorted)$confidence,
    lift = quality(rules_sorted)$lift
)

# 6. Visualization
# Scatter Plot: support vs confidence, colored by lift, rule text on hover
scatter_plot <- plot_ly(rules_df, 
        x = ~support,
        y = ~confidence,
        color = ~lift,
        type = "scatter",
        mode = "markers",
        marker = list(size = 10),
        text = ~paste("Rule:", lhs, "=>", rhs)) %>%
    layout(
        title = "Apriori Algorithm",
        xaxis = list(title = "Support"),
        yaxis = list(title = "Confidence"),
        colorbar = list(title = "Lift")
    )



# 7. Print Quality Assessment
cat("\nAssociation Rule Analysis Summary:")

Association Rule Analysis Summary:
cat("\nNumber of Rules:", nrow(rules_df))

Number of Rules: 21
cat("\nAverage Confidence (Accuracy):", round(mean(rules_df$confidence), 4))

Average Confidence (Accuracy): 0.7977
cat("\nAverage Lift:", round(mean(rules_df$lift), 4))

Average Lift: 1.0168
cat("\nAverage Support:", round(mean(rules_df$support), 4))

Average Support: 0.2092
# Print top 10 rules
cat("\n\nTop 10 Rules by Lift:\n")


Top 10 Rules by Lift:
head(rules_df[order(-rules_df$lift), 
              c("lhs", "rhs", "support", "confidence", "lift")], 10)
                                           lhs                rhs    support
1  {ALT_Level=Mild_Elevation,AST_Level=Normal} {Age_Group=Middle} 0.05691057
2                   {ALT_Level=Mild_Elevation} {Age_Group=Middle} 0.07154472
3           {Age_Group=Young,AST_Level=Normal} {ALT_Level=Normal} 0.08780488
4                   {AST_Level=Mild_Elevation} {Age_Group=Middle} 0.05040650
5         {Age_Group=Elderly,AST_Level=Normal} {ALT_Level=Normal} 0.03577236
6           {Age_Group=Young,ALT_Level=Normal} {AST_Level=Normal} 0.08780488
7          {Age_Group=Middle,ALT_Level=Normal} {AST_Level=Normal} 0.35934959
8          {Age_Group=Senior,AST_Level=Normal} {ALT_Level=Normal} 0.26829268
9                           {ALT_Level=Normal} {AST_Level=Normal} 0.75121951
10                          {AST_Level=Normal} {ALT_Level=Normal} 0.75121951
   confidence     lift
1   0.6034483 1.258036
2   0.5789474 1.206958
3   0.9473684 1.120445
4   0.5254237 1.095375
5   0.9166667 1.084135
6   0.9152542 1.082464
7   0.9132231 1.080062
8   0.9016393 1.066362
9   0.8884615 1.050777
10  0.8884615 1.050777
# Display plots
scatter_plot

CHARM Algorithm

# 1. Data Preprocessing
# Discretize the continuous lab values into clinical bands and map Category
# to readable labels. The AST labels carry a leading space — presumably to
# keep AST items distinct from ALT items in the flat CHARM item universe;
# kept as-is (TODO confirm intended).
hepatitis_categorized <- hepatitis %>%
  mutate(
    Age_Group = case_when(
      Age < 35 ~ "Young",
      Age < 50 ~ "Middle",
      Age < 65 ~ "Senior",
      TRUE ~ "Elderly"
    ),
    ALT_Level = case_when(
      ALT < 40 ~ "ALT_Normal",
      ALT < 80 ~ "ALT_Mild_Elevation",
      ALT < 200 ~ "ALT_Moderate_Elevation",
      TRUE ~ "ALT_Severe_Elevation"
    ),
    AST_Level = case_when(
      AST < 40 ~ " AST_Normal",
      AST < 80 ~ " AST_Mild_Elevation",
      AST < 200 ~ " AST_Moderate_Elevation",
      TRUE ~ " AST_Severe_Elevation"
    ),
    # BUG FIX: Category holds strings such as "0=Blood Donor", so the old
    # comparisons against the bare codes 0..4 never matched — Disease_Status
    # was NA for every row and never showed up among the mined items/rules.
    Disease_Status = case_when(
      Category == "0=Blood Donor" ~ "Blood_Donor",
      Category == "0s=suspect Blood Donor" ~ "Suspect",
      Category == "1=Hepatitis" ~ "Hepatitis",
      Category == "2=Fibrosis" ~ "Fibrosis",
      Category == "3=Cirrhosis" ~ "Cirrhosis"
    )
  )

# 2. CHARM Implementation
charm_association_rules <- function(data, min_support = 0.05, min_confidence = 0.5) {
    # Convert to transaction format
    trans_matrix <- as.matrix(data)
    n_trans <- nrow(trans_matrix)
    
    # Find unique items
    items <- unique(as.character(trans_matrix))
    cat("Initial items:", length(items), "\n")
    
    # Create vertical database (tid-lists)
    tid_lists <- list()
    frequent_items <- character()
    
    # Find frequent single items
    for(item in items) {
        tids <- which(apply(trans_matrix, 1, function(x) item %in% x))
        support <- length(tids)/n_trans
        if(support >= min_support) {
            tid_lists[[item]] <- tids
            frequent_items <- c(frequent_items, item)
        }
    }
    
    cat("Frequent items:", length(frequent_items), "\n")
    
    # Store closed itemsets
    closed_itemsets <- list()
    
    # Find closed itemsets
    find_closed_itemsets <- function(prefix = character(), prefix_tids = NULL, 
                                   remaining_items = frequent_items) {
        for(i in seq_along(remaining_items)) {
            item <- remaining_items[i]
            new_tids <- if(is.null(prefix_tids)) tid_lists[[item]]
                       else intersect(prefix_tids, tid_lists[[item]])
            
            support <- length(new_tids)/n_trans
            
            if(support >= min_support) {
                new_itemset <- c(prefix, item)
                
                # Check if closed
                is_closed <- TRUE
                for(existing in names(closed_itemsets)) {
                    if(all(new_itemset %in% strsplit(existing, ",")[[1]]) && 
                       closed_itemsets[[existing]]$support == support) {
                        is_closed <- FALSE
                        break
                    }
                }
                
                if(is_closed) {
                    closed_itemsets[[paste(new_itemset, collapse=",")]] <<- list(
                        items = new_itemset,
                        support = support,
                        tids = new_tids
                    )
                }
                
                # Recursive call with remaining items
                if(i < length(remaining_items)) {
                    find_closed_itemsets(new_itemset, new_tids, 
                                       remaining_items[(i+1):length(remaining_items)])
                }
            }
        }
    }
    
    # Generate closed itemsets
    cat("Generating closed itemsets...\n")
    find_closed_itemsets()
    cat("Found", length(closed_itemsets), "closed itemsets\n")
    
    # Generate rules from closed itemsets
    rules <- data.frame(
        lhs = character(),
        rhs = character(),
        support = numeric(),
        confidence = numeric(),
        lift = numeric(),
        stringsAsFactors = FALSE
    )
    
    # Generate rules
    cat("Generating association rules...\n")
    for(itemset_key in names(closed_itemsets)) {
        itemset <- closed_itemsets[[itemset_key]]
        if(length(itemset$items) >= 2) {
            for(i in 1:(length(itemset$items)-1)) {
                combs <- combn(itemset$items, i, simplify = FALSE)
                for(lhs in combs) {
                    rhs <- setdiff(itemset$items, lhs)
                    
                    # Calculate confidence
                    lhs_support <- max(sapply(names(closed_itemsets), function(key) {
                        if(all(lhs %in% closed_itemsets[[key]]$items))
                            return(closed_itemsets[[key]]$support)
                        return(0)
                    }))
                    
                    if(lhs_support > 0) {
                        confidence <- itemset$support / lhs_support
                        
                        if(confidence >= min_confidence) {
                            # Calculate lift
                            rhs_support <- max(sapply(names(closed_itemsets), function(key) {
                                if(all(rhs %in% closed_itemsets[[key]]$items))
                                    return(closed_itemsets[[key]]$support)
                                return(0)
                            }))
                            
                            lift <- confidence / rhs_support
                            
                            rules[nrow(rules) + 1,] <- list(
                                lhs = paste(lhs, collapse=", "),
                                rhs = paste(rhs, collapse=", "),
                                support = itemset$support,
                                confidence = confidence,
                                lift = lift
                            )
                        }
                    }
                }
            }
        }
    }
    
    return(rules)
}

# 3. Apply CHARM to dataset
# Restrict to the four categorical columns; each row becomes one transaction
selected_data <- select(hepatitis_categorized, 
                       Age_Group, ALT_Level, AST_Level, Disease_Status)
rules <- charm_association_rules(selected_data, min_support = 0.05, min_confidence = 0.5)
Initial items: 13 
Frequent items: 10 
Generating closed itemsets...
Found 24 closed itemsets
Generating association rules...
# 4. Visualize results
if(nrow(rules) > 0) {
    # Scatter plot of confidence vs lift, colored by support, with the full
    # rule shown on hover
    scatter_plot <- plot_ly(rules,
        x = ~confidence,
        y = ~lift,
        color = ~support,
        type = "scatter",
        mode = "markers",
        text = ~paste("Rule:", lhs, "=>", rhs,
                     "<br>Support:", round(support, 4),
                     "<br>Confidence:", round(confidence, 4),
                     "<br>Lift:", round(lift, 4))
    ) %>%
        layout(
            # BUG FIX: corrected misspelled plot title ("Algotrithm")
            title = "CHARM Algorithm",
            xaxis = list(title = "Confidence"),
            yaxis = list(title = "Lift"),
            colorbar = list(title = "Support")
        )
    
    # Print summary statistics for the mined rule set
    cat("\nAssociation Rule Analysis Results:\n")
    cat("Total rules found:", nrow(rules), "\n")
    cat("Average confidence:", round(mean(rules$confidence), 4), "\n")
    cat("Average lift:", round(mean(rules$lift), 4), "\n")
    cat("Average support:", round(mean(rules$support), 4), "\n")
    
    # Print top rules
    cat("\nTop 10 Rules by Lift:\n")
    print(head(rules[order(-rules$lift), ], 10))
    
    # Save results
    write.csv(rules, "charm_rules.csv", row.names = FALSE)
} else {
    cat("No rules found. Try adjusting the minimum support or confidence thresholds.\n")
}

Association Rule Analysis Results:
Total rules found: 23 
Average confidence: 0.8 
Average lift: 1.0279 
Average support: 0.2454 

Top 10 Rules by Lift:
                               lhs                     rhs    support
12 ALT_Mild_Elevation,  AST_Normal                  Middle 0.05691057
10              ALT_Mild_Elevation                  Middle 0.07154472
4               Young,  AST_Normal              ALT_Normal 0.08780488
14              AST_Mild_Elevation                  Middle 0.05040650
3                Young, ALT_Normal              AST_Normal 0.08780488
8               Middle, ALT_Normal              AST_Normal 0.35934959
18             Senior,  AST_Normal              ALT_Normal 0.26829268
2                            Young ALT_Normal,  AST_Normal 0.08780488
20                      ALT_Normal              AST_Normal 0.75121951
21                      AST_Normal              ALT_Normal 0.75121951
   confidence     lift
12  0.6034483 1.258036
10  0.5789474 1.206958
4   0.9473684 1.120445
14  0.5254237 1.095375
3   0.9152542 1.082464
8   0.9132231 1.080062
18  0.9016393 1.066362
2   0.7941176 1.057105
20  0.8884615 1.050777
21  0.8884615 1.050777
# Display the interactive CHARM rule scatter plot
scatter_plot